For this assignment, you will use the reinforcement learning algorithm Q-learning, with a neural network to approximate the Q function. You will apply it to the game of Tic-Tac-Toe and to the control of a simple robot arm.
Most of the code is provided. You are asked to make specific modifications and to find parameter values that result in good performance on these tasks. The two tasks will probably require different parameter values.
Download necessary code from ttt_arm.zip.
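As a reminder of the underlying update rule: Q-learning learns an action-value function by repeatedly moving each estimate toward a bootstrapped target. With a neural network standing in for the Q table, each sampled step $(s_t, a_t, r_{t+1}, s_{t+1})$ contributes a training target of the form

$$Q(s_t, a_t) \leftarrow r_{t+1} + \gamma \max_{a} Q(s_{t+1}, a),$$

with the $\max$ replaced by a $\min$ for the agent that is trying to minimize the reinforcement (the O player below), and with the target equal to just $r_{t+1}$ at a terminal state. The `gamma` entry in the parameter dictionaries below is the discount factor $\gamma$.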
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
import pandas as pd
import pickle
import tictactoe
class Game:

    def __init__(self, environment, agents):
        self.env = environment
        self.agents = agents

    def train(self, parms, verbose=True):

        n_batches = parms['n_batches']
        n_games_per_batch = parms['n_games_per_batch']
        n_epochs = parms['n_epochs']
        method = parms['method']
        learning_rate = parms['learning_rate']
        epsilon = parms['initial_epsilon']
        final_epsilon = parms['final_epsilon']

        ttt = self.env

        epsilon_decay = np.exp((np.log(final_epsilon) - np.log(epsilon)) / n_batches)  # decay factor so epsilon reaches final_epsilon after n_batches batches
        epsilon_trace = []
        outcomes = []

        for batch in range(n_batches):

            self.agents['X'].clear_samples()
            self.agents['O'].clear_samples()

            for gamei in range(n_games_per_batch):

                ttt.initialize()
                done = False
                while not done:

                    agent = self.agents[ttt.player]
                    obs = ttt.observe()
                    if len(self.env.valid_actions()) == 9:
                        action = np.random.choice(self.env.valid_actions())
                        # print('picked random action at start of game')
                    else:
                        action = agent.epsilon_greedy(epsilon)
                        # print('picked best action')
                    ttt.act(action)
                    r = ttt.reinforcement()
                    done = ttt.terminal_state()
                    agent.add_sample(obs, action, r, done)

                outcomes.append(r)

            # end n_games_per_batch

            self.agents['X'].train(n_epochs, method, learning_rate)
            self.agents['O'].train(n_epochs, method, learning_rate)

            epsilon_trace.append(epsilon)
            epsilon *= epsilon_decay

            if verbose and (len(outcomes) % ((n_batches * n_games_per_batch) // 20) == 0):
                print(f'{len(outcomes)}/{n_batches * n_games_per_batch} games, {np.mean(outcomes):.2f} outcome mean')

        if verbose:

            plt.subplot(3, 1, 1)
            n_per = 10
            n_bins = len(outcomes) // n_per
            outcomes_binned = np.array(outcomes).reshape(-1, n_per)
            avgs = outcomes_binned.mean(1)
            xs = np.linspace(n_per, n_per * n_bins, len(avgs))
            plt.plot(xs, avgs)
            plt.axhline(y=0, color='orange', ls='--')
            plt.ylabel('R')

            plt.subplot(3, 1, 2)
            plt.plot(xs, np.sum(outcomes_binned == -1, axis=1), 'r-', label='O Wins')
            plt.plot(xs, np.sum(outcomes_binned == 0, axis=1), 'b-', label='Draws')
            plt.plot(xs, np.sum(outcomes_binned == 1, axis=1), 'g-', label='X Wins')
            plt.legend(loc='center')
            plt.ylabel(f'Number of Games\nin Bins of {n_per:d}')

            plt.subplot(3, 1, 3)
            plt.plot(epsilon_trace)
            plt.ylabel(r'$\epsilon$')

        return outcomes, epsilon_trace
    def play_game(self, epsilon=0.0, verbose=True):
        ttt = self.env
        agents = self.agents
        ttt.initialize()
        while True:
            agent = agents[ttt.player]
            obs = ttt.observe()
            if len(ttt.valid_actions()) == 9:
                action = agent.epsilon_greedy(epsilon=1.0)
            else:
                action = agent.epsilon_greedy(epsilon)
            ttt.act(action)
            if verbose:
                print(ttt)
            if ttt.terminal_state():
                return ttt.reinforcement()

    def play_game_show_Q(self, epsilon=0.0):
        ttt = self.env
        agents = self.agents
        step = 0
        ttt.initialize()
        while True:
            agent = agents[ttt.player]
            obs = ttt.observe()
            actions = ttt.valid_actions()
            if len(ttt.valid_actions()) == 9:
                action = agent.epsilon_greedy(epsilon=1.0)
            else:
                action = agent.epsilon_greedy(epsilon)
            ttt.act(action)
            step += 1

            plt.subplot(5, 2, step)
            Qs = np.array([agent.use(np.hstack((obs, a))) for a in actions])
            board_image = np.array([np.nan] * 9)
            for Q, a in zip(Qs, actions):
                board_image[a] = Q[0, 0]
            board_image = board_image.reshape(3, 3)
            maxmag = np.nanmax(np.abs(board_image))
            plt.imshow(board_image, cmap='coolwarm', vmin=-maxmag, vmax=maxmag)
            plt.colorbar()

            obs = ttt.observe()
            i = -1
            for row in range(3):
                for col in range(3):
                    i += 1
                    if obs[i] == 1:
                        plt.text(col, row, 'X', ha='center',
                                 fontweight='bold', fontsize='large', color='black')
                    elif obs[i] == -1:
                        plt.text(col, row, 'O', ha='center',
                                 fontweight='bold', fontsize='large', color='black')
            plt.axis('off')

            if ttt.terminal_state():
                break

        plt.tight_layout()
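The epsilon_decay factor in train is the constant multiplier that takes epsilon from initial_epsilon down to (approximately) final_epsilon over n_batches batches. A minimal check of that arithmetic, using the same formula and example values from the parameter dictionaries below:

initial_epsilon, final_epsilon, n_batches = 1.0, 0.01, 500
epsilon_decay = np.exp((np.log(final_epsilon) - np.log(initial_epsilon)) / n_batches)
print(epsilon_decay)                                  # multiplier applied once per batch
print(initial_epsilon * epsilon_decay ** n_batches)   # ~0.01, the requested final value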
ttt = tictactoe.TicTacToe()
nh = [10]
agents = {'X': tictactoe.QnetAgent(ttt, nh, 'max'),
          'O': tictactoe.QnetAgent(ttt, nh, 'min')}
game = Game(ttt, agents)
game.play_game(0)
X| | 
-----
 | | 
-----
 | | 

X| | 
-----
O| | 
-----
 | | 

X|X| 
-----
O| | 
-----
 | | 

X|X| 
-----
O|O| 
-----
 | | 

X|X|X
-----
O|O| 
-----
 | | 
1
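Before tuning anything, it can help to know what two untrained agents do. Here is a quick baseline sketch that plays entirely random moves with epsilon=1.0; the exact numbers will vary from run to run:

rs = np.array([game.play_game(epsilon=1.0, verbose=False) for _ in range(1000)])
print(f'random play: mean outcome {rs.mean():.2f}, '
      f'X wins {np.mean(rs == 1):.2f}, draws {np.mean(rs == 0):.2f}, O wins {np.mean(rs == -1):.2f}')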
previous_best = -np.inf
results = []

for nb in [100, 500]:
    for ng in [5, 10, 20]:
        for ne in [2, 5, 10, 20]:
            for nh in [[], [50], [50, 50, 50]]:

                parms = {
                    'n_batches': nb,
                    'n_games_per_batch': ng,
                    'n_epochs': ne,
                    'method': 'scg',
                    'learning_rate': 0.01,
                    'initial_epsilon': 1.0,
                    'final_epsilon': 0.01,
                    'gamma': 1.0
                }

                agents = {'X': tictactoe.QnetAgent(ttt, nh, 'max'),
                          'O': tictactoe.QnetAgent(ttt, nh, 'min')}
                game = Game(ttt, agents)

                outcomes, _ = game.train(parms, verbose=False)
                mean_outcomes = np.mean(outcomes)
                results.append([nh, nb, ng, ne, mean_outcomes])

                clear_output()
                df = pd.DataFrame(results,
                                  columns=('hiddens', 'batches', 'games',
                                           'epochs', 'mean r'))
                print(df.sort_values(by='mean r', ascending=False))

                if mean_outcomes > previous_best:
                    previous_best = mean_outcomes
                    with open('best_ttt_agents.pkl', 'wb') as f:
                        pickle.dump(agents, f)
         hiddens  batches  games  epochs  mean r
43          [50]      500      5      10  0.4244
41  [50, 50, 50]      500      5       5  0.4168
2   [50, 50, 50]      100      5       2  0.3840
65  [50, 50, 50]      500     20       5  0.3734
61          [50]      500     20       2  0.3726
..           ...      ...    ...     ...     ...
56  [50, 50, 50]      500     10      10  0.1780
30            []      100     20      10  0.1750
33            []      100     20      20  0.1575
15            []      100     10       5  0.1520
68  [50, 50, 50]      500     20      10  0.1440

[72 rows x 5 columns]
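The four nested loops above are just a grid search over every parameter combination. If you prefer, the same search can be written more compactly with itertools.product; a sketch of the loop header only (the body stays exactly as above):

import itertools

for nb, ng, ne, nh in itertools.product([100, 500],
                                        [5, 10, 20],
                                        [2, 5, 10, 20],
                                        [[], [50], [50, 50, 50]]):
    ...  # same body as above: build parms, create agents, train, record results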
with open('best_ttt_agents.pkl', 'rb') as f:
    agents = pickle.load(f)

ttt = agents['X'].env
game = Game(ttt, agents)

rs = []
for n_games in range(100):
    rs.append(game.play_game(epsilon=0.05, verbose=False))
print(f'mean of final outcomes {np.mean(rs)}')
mean of final outcomes 0.35
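The mean alone hides the win/draw/loss breakdown. A small follow-up on the same rs list (the counts will of course depend on your trained agents):

rs = np.array(rs)
print(f'X wins: {np.sum(rs == 1)}, draws: {np.sum(rs == 0)}, O wins: {np.sum(rs == -1)} out of {len(rs)} games')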
game.play_game_show_Q()
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
import pandas as pd
import pickle
import robot
class Experiment:

    def __init__(self, environment, agent):
        self.env = environment
        self.agent = agent

    def train(self, parms, verbose=True):

        n_batches = parms['n_batches']
        n_steps_per_batch = parms['n_steps_per_batch']
        n_epochs = parms['n_epochs']
        method = parms['method']
        learning_rate = parms['learning_rate']
        epsilon = parms['initial_epsilon']
        final_epsilon = parms['final_epsilon']
        gamma = parms['gamma']

        env = self.env

        epsilon_decay = np.exp((np.log(final_epsilon) - np.log(epsilon)) / n_batches)  # decay factor so epsilon reaches final_epsilon after n_batches batches
        epsilon_trace = []
        outcomes = []

        for batch in range(n_batches):

            self.agent.clear_samples()

            env.initialize()
            sum_rs = 0
            for step in range(n_steps_per_batch):

                obs = self.env.observe()
                action = self.agent.epsilon_greedy(epsilon)

                env.act(action)
                r = env.reinforcement()
                sum_rs += r
                done = step == n_steps_per_batch - 1  # no terminal state in this task; just mark the last sample of the batch
                self.agent.add_sample(obs, action, r, done)

            outcomes.append(sum_rs / n_steps_per_batch)

            self.agent.train(n_epochs, method, learning_rate, gamma)

            epsilon_trace.append(epsilon)
            epsilon *= epsilon_decay

            if verbose and (len(outcomes) % (n_batches // 20) == 0):
                print(f'{len(outcomes)}/{n_batches} batches, {np.mean(outcomes):.4f} outcome mean')

        if verbose:

            plt.figure(1)
            plt.clf()

            plt.subplot(2, 1, 1)
            n_per = 10
            n_bins = len(outcomes) // n_per
            outcomes_binned = np.array(outcomes).reshape(-1, n_per)
            avgs = outcomes_binned.mean(1)
            xs = np.linspace(n_per, n_per * n_bins, len(avgs))
            plt.plot(xs, avgs)
            plt.axhline(y=0, color='orange', ls='--')
            plt.ylabel('R')

            plt.subplot(2, 1, 2)
            plt.plot(epsilon_trace)
            plt.ylabel(r'$\epsilon$')
            # plt.pause(0.1)

        return outcomes  # , epsilon_trace

    def test(self, n_trials, n_steps, epsilon=0.0, graphics=True):

        if graphics:
            fig = plt.figure(figsize=(10, 10))

        robot = self.env
        sum_rs = 0

        for trial in range(n_trials):

            robot.initialize()
            agent = self.agent

            points = np.zeros((n_steps, robot.n_links + 1, 2))
            actions = np.zeros((n_steps, robot.n_links))
            Q_values = np.zeros((n_steps))

            for i in range(n_steps):
                action = agent.epsilon_greedy(epsilon)
                Q = agent.use(np.hstack((robot.observe(), action)))
                self.env.act(action)
                sum_rs += self.env.reinforcement()
                points[i] = robot.points
                actions[i] = action
                Q_values[i] = Q[0, 0]

            if graphics:
                Q_min, Q_max = np.min(Q_values), np.max(Q_values)
                print(Q_min, Q_max)
                for i in range(n_steps):
                    fig.clf()
                    plt.scatter(robot.goal[0], robot.goal[1], s=40, c='blue')
                    action = actions[i]
                    robot.set_points(points[i])
                    robot.draw()  # alpha=(Q_values[i] - Q_min) / (Q_max - Q_min))
                    clear_output(wait=True)
                    display(fig)
                clear_output(wait=True)

        return sum_rs / (n_trials * n_steps)
robbie = robot.Robot()
robbie.set_goal([5., 5.])
agent = robot.QnetAgent(robbie, [100, 100, 100])
experiment = Experiment(robbie, agent)
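Before training, it can be useful to look at the arm and its goal. Here is a minimal sketch using only calls that already appear in Experiment.test (initialize, draw, and the goal attribute); how draw handles the current figure may differ in your copy of robot.py:

robbie.initialize()
plt.figure(figsize=(5, 5))
plt.scatter(robbie.goal[0], robbie.goal[1], s=40, c='blue')  # goal position
robbie.draw()                                                # current arm configuration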
import pandas as pd

previous_best = -np.inf
results = []

for nb in [100, 200, 5000]:
    for ns in [50, 100]:
        for ne in [5, 10]:
            for nh in [[], [50], [50, 50]]:

                parms = {
                    'n_batches': nb,
                    'n_steps_per_batch': ns,
                    'n_epochs': ne,
                    'method': 'scg',
                    'learning_rate': 0.01,
                    'initial_epsilon': 1.0,
                    'final_epsilon': 0.001,
                    'gamma': 1.0
                }

                agent = robot.QnetAgent(robbie, nh)
                experiment = Experiment(robbie, agent)

                outcomes = experiment.train(parms, verbose=False)
                results.append([nh, nb, ns, ne, outcomes[-1]])
                clear_output()
                df = pd.DataFrame(results,
                                  columns=('hiddens', 'batches', 'steps',
                                           'epochs', 'dist'))
                print(df.sort_values(by='dist', ascending=False))

                if outcomes[-1] > previous_best:
                    previous_best = outcomes[-1]
                    with open('best_robot_agent.pkl', 'wb') as f:
                        pickle.dump(agent, f)

print()
     hiddens  batches  steps  epochs       dist
13      [50]      200     50       5  -1.814330
17  [50, 50]      200     50      10  -4.569909
1       [50]      100     50       5  -5.063843
16      [50]      200     50      10  -5.805537
5   [50, 50]      100     50      10  -5.866438
4       [50]      100     50      10  -7.114635
18        []      200    100       5  -7.848883
7       [50]      100    100       5  -7.996590
6         []      100    100       5  -8.317250
0         []      100     50       5  -8.321376
3         []      100     50      10  -8.546975
10      [50]      100    100      10  -8.749536
12        []      200     50       5  -8.816075
2   [50, 50]      100     50       5  -8.881165
15        []      200     50      10  -9.246151
9         []      100    100      10  -9.302054
11  [50, 50]      100    100      10 -10.096282
8   [50, 50]      100    100       5 -10.254745
14  [50, 50]      200     50       5 -10.688630
KeyboardInterrupt: the search was stopped by hand (inside a QnetAgent.epsilon_greedy call during experiment.train) before the n_batches = 5000 combinations finished; the table above shows the combinations completed up to that point.
with open('best_robot_agent.pkl', 'rb') as f:
    agent = pickle.load(f)
robbie = agent.env
experiment = Experiment(robbie, agent)
mean_r = experiment.test(n_trials=10, n_steps=100, epsilon=0.0, graphics=False)
print(f'mean of reinforcements {mean_r:.3f}')
mean of reinforcements -2.424
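For comparison, you can rerun the same test with a little exploration noise to see how sensitive the learned policy is; a sketch, with the value of epsilon chosen arbitrarily:

mean_r_noisy = experiment.test(n_trials=10, n_steps=100, epsilon=0.05, graphics=False)
print(f'mean of reinforcements with epsilon=0.05: {mean_r_noisy:.3f}')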
parms = {
    'n_batches': 500,
    'n_steps_per_batch': 50,
    'n_epochs': 5,
    'method': 'scg',
    'learning_rate': 0.01,
    'initial_epsilon': 1.0,
    'final_epsilon': 0.0001,
    'gamma': 1.0
}

agent = robot.QnetAgent(robbie, [50])
experiment = Experiment(robbie, agent)
outcomes = experiment.train(parms)
25/500 batches, -7.9906 outcome mean
50/500 batches, -7.8550 outcome mean
75/500 batches, -7.6936 outcome mean
100/500 batches, -7.4452 outcome mean
125/500 batches, -7.2722 outcome mean
150/500 batches, -7.0897 outcome mean
175/500 batches, -6.8215 outcome mean
200/500 batches, -6.7659 outcome mean
225/500 batches, -6.6292 outcome mean
250/500 batches, -6.4414 outcome mean
275/500 batches, -6.2562 outcome mean
300/500 batches, -6.1209 outcome mean
325/500 batches, -5.9951 outcome mean
350/500 batches, -5.9060 outcome mean
375/500 batches, -5.8354 outcome mean
400/500 batches, -5.7448 outcome mean
425/500 batches, -5.6290 outcome mean
450/500 batches, -5.5753 outcome mean
475/500 batches, -5.5057 outcome mean
500/500 batches, -5.4142 outcome mean
experiment.test(10, 100, epsilon=0.0, graphics=True)
-3.4894459784287455
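If this retrained agent does better than the one saved during the search, you may want to keep it. The same pickling pattern used earlier works here; the file name below is just a hypothetical choice so the search result is not overwritten:

with open('retrained_robot_agent.pkl', 'wb') as f:  # hypothetical new file name
    pickle.dump(agent, f)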